20211009_10x_LY adv1110

only ILC2 for topic modeling

source("../../../analysis.10x.r")
GEX.seur <- readRDS("adv_1110.GEX.seur_ILC2.Anno.rds")
GEX.seur
## An object of class Seurat 
## 15083 features across 9076 samples within 1 assay 
## Active assay: RNA (15083 features, 800 variable features)
##  3 dimensional reductions calculated: pca, tsne, umap

umap

pumap.2 <- DimPlot(GEX.seur, reduction = "umap", label = F,group.by = "SP.info",
        cols = c("#5A5C5F","#0000C8","#FDB911"))+ labs(title="")+ theme(aspect.ratio = 0.75)
pumap.1 <- DimPlot(GEX.seur, reduction = "umap", label = T, group.by = "seurat_clusters") + NoLegend()+ labs(title="")+ theme(aspect.ratio = 0.75)
pumap.2 + pumap.1

pumap.3 <- DimPlot(GEX.seur, group.by = "SP.info", split.by = "SP.info",cols = c("#5A5C5F","#0000C8","#FDB911"), ncol = 3)+ theme(aspect.ratio = 0.75) + 
  labs(title = "") + NoLegend()
pumap.3

pumap.4 <- DimPlot(GEX.seur, group.by = "SP.info", split.by = "SP.info",cols = c("#5A5C5F","#0000C8","#FDB911"), ncol = 1)+ 
  labs(title = "") +
  theme(aspect.ratio = 0.75) + 
  NoLegend()
pumap.4

topic model

basically follow https://bitbucket.org/jerry00/mouse_small_intestine_immune_cell_atlas/src/master/script/topicModeling_PP.R

source("topicmodel.r")

calculate topics

# using umap coordinates
GEX.seur@reductions[['umap']]@cell.embeddings[1:5,1:2]
##                        UMAP_1     UMAP_2
## AAACCCAAGCATCCTA-1 -1.1394443  3.0657358
## AAACCCAAGGGCAACT-1 -3.0438967 -0.2278638
## AAACCCAAGGTGCCAA-1 -3.9796829 -1.0369316
## AAACCCACAGAAGTGC-1 -0.7091646 -1.2534755
## AAACCCAGTAGGGTAC-1  4.3127932  1.5042106
# to exclude ribosomal
ribo.loc <- grepl("Rpl|Rps", rownames(GEX.seur@assays[['RNA']]@counts))
head(rownames(GEX.seur@assays[['RNA']]@counts)[ribo.loc])
## [1] "Rpl7"    "Rpl31"   "Rpl37a"  "Rps6kc1" "Rpl7a"   "Rpl12"
sum(ribo.loc)
## [1] 96
# Build topic models and generate assocaited figures 
# Recommended to build models for k = 3,4,5,6,7,8,9,10 and tolerance 0.1 (set as inputs to this function)
# each K would take hours, bigger K, longer time
# so slow, would run into three sub-scripts from 3 to 10
#for(kk in c(3,4,5,6,7,8,9,10)){
#  kkk <- kk
#  if(kk %in% c(4,6,8)){
#    kkk <- paste0(0,kk)
#  }
#  FitGoM(t(as.matrix(GEX.seur@assays[['RNA']]@counts)[!ribo.loc,]),
#         K = kk, tol = 0.1,
#         path_rda =paste0("topicmodel/test_topic.n",kkk,"_t0.1.rda"))
#}
# after topic models are done, ranged K, 
# here could directly load the saved object
# the default var is 'Topic_clus'
load("topicmodel/test_topic.n08_t0.1.rda")
omega <- as.data.frame(Topic_clus$omega)
colnames(omega) <- paste0("Topic_", colnames(omega))
GEX.seur <- AddMetaData(GEX.seur, omega)
head(GEX.seur@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt  FB.info
## AAACCCAAGCATCCTA-1   ILC2_GEX      13805         3251   3.940601 hashtag6
## AAACCCAAGGGCAACT-1   ILC2_GEX       5200         2039   3.518554 hashtag5
## AAACCCAAGGTGCCAA-1   ILC2_GEX       4848         1503   3.361518 hashtag1
## AAACCCACAGAAGTGC-1   ILC2_GEX       5665         1673   4.165931 hashtag3
## AAACCCAGTAGGGTAC-1   ILC2_GEX       3458         1574   1.098583 hashtag5
## AAACCCATCGGTCGAC-1   ILC2_GEX       3298         1169   4.093390 hashtag6
##                        S.Score  G2M.Score Phase DoubletFinder0.05
## AAACCCAAGCATCCTA-1 -0.08767960 -0.3449294    G1           Singlet
## AAACCCAAGGGCAACT-1 -0.04374438 -0.1981341    G1           Singlet
## AAACCCAAGGTGCCAA-1 -0.03750102 -0.1720957    G1           Singlet
## AAACCCACAGAAGTGC-1 -0.11390714 -0.2737016    G1           Singlet
## AAACCCAGTAGGGTAC-1 -0.03319338 -0.1208448    G1           Singlet
## AAACCCATCGGTCGAC-1 -0.05704899 -0.2030185    G1           Singlet
##                    DoubletFinder0.1 preAnno_1 preAnno_2  SP.info
## AAACCCAAGCATCCTA-1          Singlet      ILC2    ILC2_4 IL-33_DA
## AAACCCAAGGGCAACT-1          Singlet      ILC2    ILC2_3    IL-33
## AAACCCAAGGTGCCAA-1          Singlet      ILC2    ILC2_3      PBS
## AAACCCACAGAAGTGC-1          Singlet      ILC2    ILC2_1    IL-33
## AAACCCAGTAGGGTAC-1          Singlet      ILC2    ILC2_2    IL-33
## AAACCCATCGGTCGAC-1          Singlet      ILC2    ILC2_3 IL-33_DA
##                    RNA_snn_res.0.6 seurat_clusters Anno_ILC2     Topic_1
## AAACCCAAGCATCCTA-1               1               1    ILC2_1 0.331038476
## AAACCCAAGGGCAACT-1               0               0    ILC2_0 0.320312192
## AAACCCAAGGTGCCAA-1               0               0    ILC2_0 0.486266747
## AAACCCACAGAAGTGC-1               0               0    ILC2_0 0.388178809
## AAACCCAGTAGGGTAC-1               3               3    ILC2_3 0.000364474
## AAACCCATCGGTCGAC-1               0               0    ILC2_0 0.378962662
##                      Topic_2      Topic_3     Topic_4    Topic_5    Topic_6
## AAACCCAAGCATCCTA-1 0.1559088 0.0826200026 0.004865470 0.17464599 0.01494368
## AAACCCAAGGGCAACT-1 0.2143718 0.0007200549 0.018321514 0.35710747 0.04640966
## AAACCCAAGGTGCCAA-1 0.2110820 0.0461481542 0.019537277 0.16199396 0.04409882
## AAACCCACAGAAGTGC-1 0.1497532 0.1755347758 0.001025182 0.07544155 0.15852893
## AAACCCAGTAGGGTAC-1 0.1149274 0.0018554194 0.426847626 0.16946351 0.13176387
## AAACCCATCGGTCGAC-1 0.3986307 0.0996796307 0.006114663 0.07988980 0.02645965
##                        Topic_7     Topic_8
## AAACCCAAGCATCCTA-1 0.199101457 0.036876086
## AAACCCAAGGGCAACT-1 0.039479830 0.003277505
## AAACCCAAGGTGCCAA-1 0.022333614 0.008539477
## AAACCCACAGAAGTGC-1 0.036283259 0.015254290
## AAACCCAGTAGGGTAC-1 0.153345182 0.001432542
## AAACCCATCGGTCGAC-1 0.005545364 0.004717507

plot

plot, for better visualization and only for visualization, set outliers as max inliers.
tk <- 8
tp <- paste0("Topic_",1:tk)
outlier <- lapply(tp, function(x) which(GEX.seur@meta.data[,x]>(mean(GEX.seur@meta.data[,x], na.rm=TRUE)+
                                                                  3*sd(GEX.seur@meta.data[,x], na.rm=TRUE))))
names(outlier) <- tp
#
max.nooutlier <- lapply(tp, function(x) if(length(unlist(outlier[x]))){
                                          ceiling(10*max(GEX.seur@meta.data[-unlist(outlier[x]),x], na.rm=TRUE))/10  
                                        }else{
                                          ceiling(10*max(GEX.seur@meta.data[,x], na.rm=TRUE))/10
                                        })
names(max.nooutlier) <- tp
build a new seurat object to apply the change
GEX.seur_override <- GEX.seur
for(xx in tp){
  GEX.seur_override@meta.data[unlist(outlier[xx]),xx] <- rep(unlist(max.nooutlier[xx]))
}

tsne/umap

generate list of tSNE plots
p_umap <- lapply(tp, function(x) plot.umap.feature(GEX.seur_override,
                                              x,
                                              "meta",
                                              pt.size = 0.5,
                                              ncols = 1,
                                              title = "weight",
                                              lower = 0,
                                              upper = unlist(max.nooutlier[x]),
                                              na.color = "gray") + theme(aspect.ratio = 0.75))
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p_umap
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

cowplot::plot_grid(plotlist = p_umap, ncol = 4)

separate conditions

still in umap

PBS

p_umap.PBS <- lapply(tp, function(x) plot.umap.feature(subset(GEX.seur_override, subset=SP.info=="PBS"),
                                              x,
                                              "meta",
                                              pt.size = 0.5,
                                              ncols = 1,
                                              title = "weight",
                                              lower = 0,
                                              upper = unlist(max.nooutlier[x]),
                                              na.color = "gray") + theme(aspect.ratio = 0.75))
p_umap.PBS
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

cowplot::plot_grid(plotlist = p_umap.PBS, ncol = 4)

IL33

p_umap.IL33 <- lapply(tp, function(x) plot.umap.feature(subset(GEX.seur_override, subset=SP.info=="IL-33"),
                                              x,
                                              "meta",
                                              pt.size = 0.5,
                                              ncols = 1,
                                              title = "weight",
                                              lower = 0,
                                              upper = unlist(max.nooutlier[x]),
                                              na.color = "gray") + theme(aspect.ratio = 0.75))
p_umap.IL33
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

cowplot::plot_grid(plotlist = p_umap.IL33, ncol = 4)

IL33_DA

p_umap.IL33_DA <- lapply(tp, function(x) plot.umap.feature(subset(GEX.seur_override, subset=SP.info=="IL-33_DA"),
                                              x,
                                              "meta",
                                              pt.size = 0.5,
                                              ncols = 1,
                                              title = "weight",
                                              lower = 0,
                                              upper = unlist(max.nooutlier[x]),
                                              na.color = "gray") + theme(aspect.ratio = 0.75))
p_umap.IL33_DA
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

cowplot::plot_grid(plotlist = p_umap.IL33_DA, ncol = 4)

topic score

collect gene scores for each topic
theta <- as.data.frame(Topic_clus$theta)
colnames(theta) <- paste0("Topic_", 1:tk)

feat.topic <- ExtractTopFeatures(theta, top_features = 100)

feat.topic_genes <- as.data.frame(sapply(1:tk, function(x) rownames(theta)[feat.topic$indices[x,]]))
colnames(feat.topic_genes) <- paste0("topic_", 1:tk)

feat.topic_scores <- as.data.frame(feat.topic$scores)
rownames(feat.topic_scores) <- paste0("topic_", 1:tk)

topic top genes

visualize the top 40 genes of each topic
plot.feat.topic <- data.frame()
for(j in 1:nrow(feat.topic_scores)){
  current.topic <- as.data.frame(t(feat.topic_scores[j,]))
  colnames(current.topic) <- "score"
  rownames(current.topic) <- feat.topic_genes[,j]
  current.topic$genes <- feat.topic_genes[,j]
  current.topic.top <- current.topic[1:40,]
  current.topic.top$topic <- rep(colnames(feat.topic_genes)[j], nrow(current.topic.top))
  current.topic.top$genes <- factor(current.topic.top$genes, levels=rev(current.topic.top$genes))
  current.topic.top$order <- paste0(current.topic.top$genes, "_", current.topic.top$topic)
  current.topic.top$order <- factor(current.topic.top$order, levels=rev(current.topic.top$order))
  
  plot.feat.topic <- rbind(plot.feat.topic, current.topic.top)
}

plot.feat.topic$topic <- factor(plot.feat.topic$topic, levels=unique(plot.feat.topic$topic))

bar plots

p_bar <- ggplot(data = plot.feat.topic, aes(x = order, y = score)) +
           geom_bar(stat = "identity") +
           coord_flip() +
           theme(axis.text.x = element_text(angle = 45, hjust = 1),
                 axis.text = element_text(size = 5), text = element_text(size = 8),
                 axis.line = element_line(colour = "black"),
                 panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank(),
                 panel.border = element_blank(),
                 panel.background = element_blank(), strip.background = element_rect(color = "white", fill = "white")) +
           facet_wrap(~topic, scales="free", ncol = 4) +
           scale_x_discrete(breaks = plot.feat.topic$order, labels = plot.feat.topic$genes)
  
p_bar        

plot expression of top 50 genes

topic.genes <- lapply(tolower(tp), function(x) plot.feat.topic$genes[which(plot.feat.topic$topic==x)])
names(topic.genes) <- tolower(tp)

for(t in 1:tk){
  p_genes <- lapply(unlist(topic.genes[t]), function(y) plot.umap.feature(GEX.seur, y, "gene",
                                                                          pt.size = 0.5,
                                                                          ncols = 1, title = "expression") + theme(aspect.ratio = 0.75))
  # arrange plots on a grid
  cowplot::plot_grid(plotlist=p_genes, ncol=4)
  
  # ggsave
  ggsave(paste0("topicmodel/K",tk,"/topics_top40_expression_on_UMAP_",tp[t],"_k",tk,"_tol0.1.pdf"),
         width = 13, height = 20, units = "in")
}

clusters vs topics

all cells

## code from Dianyu's github
#    https://github.com/chendianyu/2021_NI_scGCB
## Connect clustering and topic model
cluster_topic_mtx <- GEX.seur_override@meta.data %>% 
    #filter(seurat_clusters != 6) %>% 
    gather(key = "topic", value = "weight", paste0("Topic_", 1:8), factor_key = T) %>%
    group_by(Anno_ILC2, topic) %>% 
    summarise(weight = mean(weight)) %>%
    spread(key = topic, value = weight) 
cluster_topic_mtx <- column_to_rownames(cluster_topic_mtx, var = "Anno_ILC2") 
topic_order <- order(max.col(t(apply(cluster_topic_mtx, 2, scale))))
pheatmap::pheatmap(t(cluster_topic_mtx)[rev(c(8,4,3,7,1,2,5,6)),c(2,1,3,4,5,6)],
         cluster_rows = F,
         cluster_cols = F,
         scale = "row",
         main= "Relation of Topics and Clusters")

## code from Dianyu's github
#    https://github.com/chendianyu/2021_NI_scGCB
## Connect clustering and topic model
plot_topic_cluster <- function(obj,topic){
    obj@meta.data %>% 
        #filter(seurat_clusters!=6) %>% 
        ggplot(aes_string("seurat_clusters", paste0("Topic_", topic), color = "seurat_clusters")) +
        geom_violin(draw_quantiles=c(0.25,0.75)) +
        ggbeeswarm::geom_quasirandom(size = 0.01, width = 0.3, alpha = 0.5) +
        stat_summary(fun.y=mean, geom="point", shape=18, size=1, color="black") +
        theme(panel.grid.major = element_blank(), 
              panel.grid.minor = element_blank(), 
              panel.background = element_blank(), 
              panel.border = element_rect(fill = NA))
}
pvnl <- list()
for(tt in 1:tk){
  pvnl[[tt]] <- plot_topic_cluster(GEX.seur_override,tt) + labs(x="")
}

pvnl
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

cowplot::plot_grid(plotlist=pvnl, ncol=4)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

compare phenotypes

#### subset Seurat object for condtions
## PBS
topic.avg.PBS <- sapply(tp, function(x){
  mean(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="PBS",x])
})
names(topic.avg.PBS) <- tp

## IL33
topic.avg.IL33 <- sapply(tp, function(x){
  mean(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33",x])
})
names(topic.avg.IL33) <- tp

## IL33_DA
topic.avg.IL33_DA <- sapply(tp, function(x){
  mean(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33_DA",x])
})
names(topic.avg.IL33_DA) <- tp

## combine
topic.avg.all <- rbind(topic.avg.PBS,topic.avg.IL33,topic.avg.IL33_DA)
rownames(topic.avg.all) <- c("PBS","IL33","IL33_DA")
####
topic.avg.PBS
##    Topic_1    Topic_2    Topic_3    Topic_4    Topic_5    Topic_6    Topic_7 
## 0.26888025 0.18235533 0.12535357 0.11290707 0.09042002 0.11799436 0.06895272 
##    Topic_8 
## 0.03166280
topic.avg.IL33
##    Topic_1    Topic_2    Topic_3    Topic_4    Topic_5    Topic_6    Topic_7 
## 0.27819400 0.15939374 0.09620652 0.09054758 0.11241967 0.10394656 0.11728610 
##    Topic_8 
## 0.04010939
topic.avg.IL33_DA
##    Topic_1    Topic_2    Topic_3    Topic_4    Topic_5    Topic_6    Topic_7 
## 0.25980210 0.16610987 0.12375749 0.12401408 0.10625613 0.09756779 0.08052668 
##    Topic_8 
## 0.03996384
topic.avg.all
##           Topic_1   Topic_2    Topic_3    Topic_4    Topic_5    Topic_6
## PBS     0.2688802 0.1823553 0.12535357 0.11290707 0.09042002 0.11799436
## IL33    0.2781940 0.1593937 0.09620652 0.09054758 0.11241967 0.10394656
## IL33_DA 0.2598021 0.1661099 0.12375749 0.12401408 0.10625613 0.09756779
##            Topic_7    Topic_8
## PBS     0.06895272 0.03166280
## IL33    0.11728610 0.04010939
## IL33_DA 0.08052668 0.03996384

plot

gp.list.box <- list()
gp.list.avg <- list()
gp.list.cum <- list()


for(top in tp){
  
  topic.data <- data.frame(weight = c(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="PBS",top],
                                      GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33",top],
                                      GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33_DA",top]),
                           condition = c(rep("PBS", length(which(GEX.seur_override$SP.info=="PBS"))),
                                         rep("IL33", length(which(GEX.seur_override$SP.info=="IL-33"))),
                                         rep("IL33_DA", length(which(GEX.seur_override$SP.info=="IL-33_DA")))),
                           topic = rep(top, nrow(GEX.seur_override@meta.data)))
  topic.data$condition <- factor(topic.data$condition, levels = c("PBS","IL33","IL33_DA"))
  
  gp.list.box[[top]] <- ggplot(topic.data, aes(y=weight, x=condition, color=condition)) +
                    geom_boxplot() + scale_y_continuous(limits=c(0,max.nooutlier[[top]])) +
                    facet_wrap(~ topic, ncol=1) +
    scale_color_manual(values = c("#5A5C5F","#0000C8","#FDB911")) + 
                    theme_bw() + theme(panel.grid=element_blank(), 
                                      strip.background = element_rect(color = "white", fill = "white"),
                                      panel.border = element_blank(),
                                      axis.line = element_line(colour = "black"))
  
  avg.data <- as.data.frame(topic.avg.all[,top])
  avg.data$topic <- rep(top, nrow(avg.data))
  colnames(avg.data) <- c("average", "topic")
  
  
  gp.list.avg[[top]] <- ggplot(avg.data, aes(x=factor(rownames(avg.data),levels = c("PBS","IL33","IL33_DA")),
                                                      y=average, 
                                             color=factor(rownames(avg.data),levels = c("PBS","IL33","IL33_DA")))) +
                   geom_col(fill="white") + xlab('') +
                   theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(), strip.background = element_rect(color = "white", fill = "white")) +
                   facet_wrap(~ topic, ncol = 1) +
                   theme_bw() + theme(panel.grid=element_blank(), 
                                      strip.background = element_rect(color = "white", fill = "white"),
                                      panel.border = element_blank(),
                                      axis.line = element_line(colour = "black")) + 
                   scale_color_manual(values = c("#5A5C5F","#0000C8","#FDB911"),name="condition")
  
  gp.list.cum[[top]] <- ggplot(topic.data, aes(x=weight, color=condition)) +
                   stat_ecdf(geom="step", size = 1) +
                   scale_color_manual(values = c("#5A5C5F","#0000C8","#FDB911")) + 
                   facet_wrap(~ topic, ncol=1) + labs(y="")+
                   theme_bw() + theme(panel.grid =element_blank(), 
                                      strip.background = element_rect(color = "white", fill = "white"),
                                      panel.border = element_blank(),
                                      axis.line = element_line(colour = "black"))
  
}
cowplot::plot_grid(plotlist=gp.list.box, ncol=4)

cowplot::plot_grid(plotlist=gp.list.avg, ncol=4)

cowplot::plot_grid(plotlist=gp.list.cum, ncol=4)

compute p-values

between each two conditions

# PBS vs IL33
p.PBS_vs_IL33.wilcox <- lapply(tp, function(x){
  wilcox.test(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="PBS",x],
              GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33",x])
})
names(p.PBS_vs_IL33.wilcox) <- tp
pmat.PBS_vs_IL33.wilcox <- sapply(tp, function(x) p.PBS_vs_IL33.wilcox[[x]]$p.value)


# PBS vs IL33_DA
p.PBS_vs_IL33DA.wilcox <- lapply(tp, function(x){
  wilcox.test(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="PBS",x],
              GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33_DA",x])
})
names(p.PBS_vs_IL33DA.wilcox) <- tp
pmat.PBS_vs_IL33DA.wilcox <- sapply(tp, function(x) p.PBS_vs_IL33DA.wilcox[[x]]$p.value)


# IL33 vs IL33_DA
p.IL33DA_vs_IL33.wilcox <- lapply(tp, function(x){
  wilcox.test(GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33_DA",x],
              GEX.seur_override@meta.data[GEX.seur_override$SP.info=="IL-33",x])
})
names(p.IL33DA_vs_IL33.wilcox) <- tp
pmat.IL33DA_vs_IL33.wilcox <- sapply(tp, function(x) p.IL33DA_vs_IL33.wilcox[[x]]$p.value)


# combine pvalue
pval.conditions.wilcox <- data.frame(PBS_vs_IL33= pmat.PBS_vs_IL33.wilcox ,
                                     PBS_vs_IL33DA= pmat.PBS_vs_IL33DA.wilcox ,
                                     IL33_vs_IL33DA= pmat.IL33DA_vs_IL33.wilcox)

# combine padjust
padj.conditions.wilcox <- data.frame(PBS_vs_IL33= p.adjust(pmat.PBS_vs_IL33.wilcox,method = "BH"),
                                     PBS_vs_IL33DA= p.adjust(pmat.PBS_vs_IL33DA.wilcox,method = "BH"),
                                     IL33_vs_IL33DA= p.adjust(pmat.IL33DA_vs_IL33.wilcox,method = "BH"))

pmerge.conditions.wilcox <- cbind(pval.conditions.wilcox,padj.conditions.wilcox)
colnames(pmerge.conditions.wilcox) <- c(paste0(colnames(pval.conditions.wilcox),".pval"),
                                        paste0(colnames(padj.conditions.wilcox),".padj"))
pval.conditions.wilcox
##          PBS_vs_IL33 PBS_vs_IL33DA IL33_vs_IL33DA
## Topic_1 7.006054e-02  4.384706e-02   6.471731e-10
## Topic_2 5.683860e-11  8.000171e-06   8.978170e-04
## Topic_3 6.612976e-24  5.306788e-01   2.711401e-47
## Topic_4 1.582588e-06  6.085626e-01   1.442550e-15
## Topic_5 1.099733e-12  2.310336e-05   6.987039e-05
## Topic_6 3.987903e-06  4.372660e-11   2.116977e-03
## Topic_7 2.729652e-31  5.773879e-02   6.692069e-47
## Topic_8 2.730927e-05  6.214584e-07   3.220758e-01
padj.conditions.wilcox
##          PBS_vs_IL33 PBS_vs_IL33DA IL33_vs_IL33DA
## Topic_1 7.006054e-02  7.015530e-02   1.294346e-09
## Topic_2 1.136772e-10  2.133379e-05   1.197089e-03
## Topic_3 2.645191e-23  6.064901e-01   2.169121e-46
## Topic_4 2.532141e-06  6.085626e-01   3.846801e-15
## Topic_5 2.932621e-12  4.620671e-05   1.117926e-04
## Topic_6 5.317204e-06  3.498128e-10   2.419402e-03
## Topic_7 2.183722e-30  7.698505e-02   2.676828e-46
## Topic_8 3.121059e-05  2.485834e-06   3.220758e-01
pmerge.conditions.wilcox
##         PBS_vs_IL33.pval PBS_vs_IL33DA.pval IL33_vs_IL33DA.pval
## Topic_1     7.006054e-02       4.384706e-02        6.471731e-10
## Topic_2     5.683860e-11       8.000171e-06        8.978170e-04
## Topic_3     6.612976e-24       5.306788e-01        2.711401e-47
## Topic_4     1.582588e-06       6.085626e-01        1.442550e-15
## Topic_5     1.099733e-12       2.310336e-05        6.987039e-05
## Topic_6     3.987903e-06       4.372660e-11        2.116977e-03
## Topic_7     2.729652e-31       5.773879e-02        6.692069e-47
## Topic_8     2.730927e-05       6.214584e-07        3.220758e-01
##         PBS_vs_IL33.padj PBS_vs_IL33DA.padj IL33_vs_IL33DA.padj
## Topic_1     7.006054e-02       7.015530e-02        1.294346e-09
## Topic_2     1.136772e-10       2.133379e-05        1.197089e-03
## Topic_3     2.645191e-23       6.064901e-01        2.169121e-46
## Topic_4     2.532141e-06       6.085626e-01        3.846801e-15
## Topic_5     2.932621e-12       4.620671e-05        1.117926e-04
## Topic_6     5.317204e-06       3.498128e-10        2.419402e-03
## Topic_7     2.183722e-30       7.698505e-02        2.676828e-46
## Topic_8     3.121059e-05       2.485834e-06        3.220758e-01

signature score

load DEGs

from Lung ILC2 SS2 data ‘DA vs PBS’ (both have IL33)

DEGs.LungILC2_DAvsPBS <- read.csv("SS2_data/DESeq2_DEGs_20210722.DA_vs_PBS.csv")
rownames(DEGs.LungILC2_DAvsPBS) <- DEGs.LungILC2_DAvsPBS$gene
head(DEGs.LungILC2_DAvsPBS)
##              gene  baseMean log2FoldChange     lfcSE      stat   pvalue
## AA467197 AA467197  44.95977      -3.829955 0.5962771 -6.423112 1.34e-10
## Bmp2         Bmp2  35.92400       3.504667 0.5794070  6.048713 1.46e-09
## Ccnb2       Ccnb2  74.74773      -2.815492 0.4675964 -6.021200 1.73e-09
## Rasa3       Rasa3 119.95292       2.220907 0.3702769  5.997963 2.00e-09
## Mrps12     Mrps12  49.92089      -2.544398 0.4440691 -5.729734 1.01e-08
## Nr1d2       Nr1d2  46.45192       2.761307 0.4808902  5.742073 9.35e-09
##              padj         FC
## AA467197 5.75e-07 -14.221036
## Bmp2     3.44e-06  11.350364
## Ccnb2    3.44e-06  -7.039591
## Rasa3    3.44e-06   4.661865
## Mrps12   1.24e-05  -5.833646
## Nr1d2    1.24e-05   6.780100
DEGs.DA_up <- (DEGs.LungILC2_DAvsPBS %>% filter(padj < 0.05 & log2FoldChange > 1))$gene
DEGs.DA_dn <- (DEGs.LungILC2_DAvsPBS %>% filter(padj < 0.05 & log2FoldChange < -1))$gene

sig. score

using code on
https://github.com/chendianyu/2021_NI_scGCB/blob/main/script/Signature_score.Rmd
which is from Adam Hamber’s

## The code below is from Adam Hamber
## 2D scoring by Itay
get_controls <- function(counts, gene.list, verbose=F, control.genes.per.gene=10)
{
    # Itay: "Such scores are inevitably correlated with cell complexity so to avoid 
    # that I subtract a "control" score which is generated by averaging over a control 
    # gene set. Control gene sets are chosen to contain 100 times more genes than the 
    # real gene set (analogous to averaging over 100 control sets of similar size) and 
    # to have the same distribution of population/bulk - based expression levels as the 
    # real gene set, such that they are expected to have the same number of "zeros" and 
    # to eliminate the correlation with complexity."
    # ---------------------------------------------------------------------------------
    # Going to find control points by finding the closest genes in terms of expression level and % of the time we observe it
    if(verbose){cat(sprintf("Finding %s background genes based on similarity to given gene set [%s genes] \n", 
                            control.genes.per.gene*length(gene.list), length(gene.list)))}
    cat("Summarizing data \n")
    summary = data.frame(gene=row.names(counts), mean.expr = Matrix::rowMeans(counts), fract.zero = Matrix::rowMeans(counts==0), stringsAsFactors = F)
    #summary = data.frame(gene=row.names(counts), mean.expr = apply(counts,1,mean), fract.zero = apply(counts==0,1,mean), stringsAsFactors = F)
    summary$mean.expr.s = scale(summary$mean.expr)
    summary$fract.zero.s = scale(summary$fract.zero)
    actual.genes = summary[summary$gene %in% gene.list,]
    background.genes = summary[!summary$gene %in% gene.list,]
    
    #find the 10 closest genes to each cell cycle marker gene and add them to the lists of control genes
    get_closest_genes <- function(i)
    {
        background.genes$dist = sqrt((background.genes$mean.expr.s - actual.genes$mean.expr.s[i])^2 + 
                                         (background.genes$fract.zero.s - actual.genes$fract.zero.s[i])^2)
        ordered = background.genes$gene[order(background.genes$dist)]
        ordered = ordered[!ordered %in% controls] # don't take genes that already appear in the list 
        closest = head(ordered, n=control.genes.per.gene)
        return(closest)
    }
    controls = c();
    
    for (i in 1:length(gene.list)){
        #info(sprintf("Finding %s control genes for %s", control.genes.per.gene, gene.list[i]))
        closest = get_closest_genes(i)
        #info(sprintf("Found %s: ", length(closest)))
        controls = unique(c(controls, closest))
    }
    
    if(verbose){cat(sprintf("Control gene selection complete. %s genes found. \n", length(controls)))}
    #print(controls)
    return(controls)
}

## Define calculate function
calculate_signature_score <- function(count_matrix, gene_list){
    control_gene <- get_controls(counts = count_matrix,
                                 gene.list = gene_list)
    signature_score <- colMeans(count_matrix[gene_list, ], na.rm = TRUE) - 
        colMeans(count_matrix[control_gene, ], na.rm = TRUE)
    return(signature_score)
}
score.DA_up <- calculate_signature_score(as.data.frame(GEX.seur@assays[['RNA']]@data),
                                         DEGs.DA_up)
## Summarizing data
#score.DA_up
score.DA_dn <- calculate_signature_score(as.data.frame(GEX.seur@assays[['RNA']]@data),
                                         DEGs.DA_dn)
## Summarizing data
#score.DA_dn
GEX.seur_score <- GEX.seur
GEX.seur_score <- AddMetaData(GEX.seur_score,
                              score.DA_up,
                              "DAup_score")
GEX.seur_score <- AddMetaData(GEX.seur_score,
                              score.DA_dn,
                              "DAdn_score")
## violin plot
vln.DA_up <- GEX.seur_score@meta.data %>%
    ggplot(aes(SP.info, DAup_score, color = SP.info)) +
    geom_violin(draw_quantiles=c(0.25,0.75),trim = FALSE) +
    #ylim(c(-0.25,0.35)) +
    ggbeeswarm::geom_quasirandom(size = 0.01, width = 0.2, alpha = 0.3) +
    stat_summary(fun.y=mean, geom="point", shape=18, size=3, color="black") +
    ggpubr::stat_compare_means(aes(lable = ..p.signif..), 
                               method = "wilcox.test",
                               comparisons = list(c("IL-33","IL-33_DA"),
                                                  c("IL-33","PBS"),
                                                  c("PBS","IL-33_DA"))) +
      scale_color_manual(values = c("#5A5C5F","#0000C8","#FDB911")) +
    labs(x="") +
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(), 
          panel.background = element_blank(), 
          panel.border = element_rect(fill = NA))
vln.DA_up + NoLegend()

## violin plot
vln.DA_dn <- GEX.seur_score@meta.data %>%
    ggplot(aes(SP.info, DAdn_score, color = SP.info)) +
    geom_violin(draw_quantiles=c(0.25,0.75),trim = FALSE) +
    #ylim(c(-0.25,0.35)) +
    ggbeeswarm::geom_quasirandom(size = 0.01, width = 0.2, alpha = 0.3) +
    stat_summary(fun.y=mean, geom="point", shape=18, size=3, color="black") +
    ggpubr::stat_compare_means(aes(lable = ..p.signif..), 
                               method = "wilcox.test",
                               comparisons = list(c("IL-33","IL-33_DA"),
                                                  c("IL-33","PBS"),
                                                  c("PBS","IL-33_DA"))) +
    scale_color_manual(values = c("#5A5C5F","#0000C8","#FDB911")) +
    labs(x="") +
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(), 
          panel.background = element_blank(), 
          panel.border = element_rect(fill = NA))
vln.DA_dn + NoLegend()